This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# library for LMM 

library(lme4)
Loading required package: Matrix
library(lmerTest)

Attaching package: ‘lmerTest’

The following object is masked from ‘package:lme4’:

    lmer

The following object is masked from ‘package:stats’:

    step
library(car)
Loading required package: carData
df<-read.csv("input/scores_commits.csv", header =TRUE, sep=",")
df <- df[complete.cases(df), ]                  # Apply complete.cases function
df
# convert to nominal factor
df$Group = factor(df$Group)
df$phase = factor(df$phase)
df$log_novelty <- log(df$novelty+1) 
df$log_user_requirement <- log(df$user.requirement+1)
df$log_infovis <- log(df$infovis+1)
df$log_total <- log(df$total+1)
df$log_count <- log(df$count+1)
df$Q7_Q7_1 <- log(df$Q7_Q7_1+1)
df$Q7_Q7_2 <- log(df$Q7_Q7_2+1)
df$Q8_Q8_1 <- log(df$Q8_Q8_1+1)
df$Q10 <- log(df$Q10+1)
# standardizing variables for skills and aspirations. 
cols <- c("Q7_Q7_1", "Q7_Q7_2", "Q8_Q8_1", "Q10", "log_novelty", "log_user_requirement", "log_infovis", "log_total", "log_count")
df[cols] <- scale(df[cols])
df
mod.reduce.novelty <- lmer( log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase ), data = df, REML = FALSE)
summary(mod.reduce.novelty)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +      (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2103.8   2141.3  -1043.9   2087.8      792 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.2633 -0.8860  0.1105  0.8440  2.1308 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.07241  0.2691  
 Residual             0.78242  0.8845  
Number of obs: 800, groups:  phase, 5

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)    
(Intercept) -9.165e-15  1.243e-01  4.992e+00   0.000  1.00000    
log_count    3.257e-01  3.179e-02  7.962e+02  10.244  < 2e-16 ***
Q7_Q7_1     -1.935e-01  3.929e-02  7.950e+02  -4.924 1.03e-06 ***
Q7_Q7_2      1.787e-01  4.027e-02  7.950e+02   4.439 1.03e-05 ***
Q8_Q8_1      1.818e-03  3.459e-02  7.950e+02   0.053  0.95811    
Q10          1.014e-01  3.375e-02  7.950e+02   3.004  0.00275 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
log_count  0.000                              
Q7_Q7_1    0.000  0.092                       
Q7_Q7_2    0.000 -0.050 -0.556                
Q8_Q8_1    0.000 -0.062 -0.085  -0.167        
Q10        0.000 -0.066 -0.021  -0.120  -0.273
AIC(mod.reduce.novelty)
[1] 2103.817
BIC(mod.reduce.novelty)
[1] 2141.294
mod.full.novelty <- lmer( log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase), data = df, REML = FALSE)
summary(mod.full.novelty)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +      Q8_Q8_1 + Q10 + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2102.5   2154.0  -1040.2   2080.5      789 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3873 -0.9114  0.1088  0.8567  2.1940 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.07219  0.2687  
 Residual             0.77526  0.8805  
Number of obs: 800, groups:  phase, 5

Fixed effects:
                 Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)     -0.127049   0.136604   7.320729  -0.930  0.38199    
factor(Group)1   0.232085   0.090159 794.997658   2.574  0.01023 *  
factor(Group)2   0.167314   0.090050 794.992569   1.858  0.06354 .  
factor(Group)3   0.096403   0.089581 794.994619   1.076  0.28219    
log_count        0.321371   0.031722 796.231055  10.131  < 2e-16 ***
Q7_Q7_1         -0.195842   0.039288 795.004690  -4.985 7.62e-07 ***
Q7_Q7_2          0.176265   0.040328 794.996130   4.371 1.40e-05 ***
Q8_Q8_1         -0.000345   0.034500 794.997224  -0.010  0.99202    
Q10              0.097272   0.034151 794.996547   2.848  0.00451 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) fc(G)1 fc(G)2 fc(G)3 lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
factr(Grp)1 -0.344                                                   
factr(Grp)2 -0.344  0.512                                            
factr(Grp)3 -0.346  0.526  0.525                                     
log_count    0.019 -0.063 -0.011 -0.041                              
Q7_Q7_1      0.002 -0.046  0.046 -0.012  0.096                       
Q7_Q7_2      0.015  0.008 -0.088 -0.008 -0.053 -0.560                
Q8_Q8_1      0.022 -0.022 -0.050 -0.056 -0.060 -0.086  -0.162        
Q10         -0.005 -0.082  0.095  0.018 -0.056 -0.004  -0.135  -0.274
AIC(mod.full.novelty)
[1] 2102.49
BIC(mod.full.novelty)
[1] 2154.021
anova(mod.full.novelty)
Type III Analysis of Variance Table with Satterthwaite's method
              Sum Sq Mean Sq NumDF  DenDF  F value    Pr(>F)    
factor(Group)  5.706   1.902     3 794.99   2.4534  0.062061 .  
log_count     79.569  79.569     1 796.23 102.6344 < 2.2e-16 ***
Q7_Q7_1       19.264  19.264     1 795.00  24.8486 7.615e-07 ***
Q7_Q7_2       14.810  14.810     1 795.00  19.1037 1.402e-05 ***
Q8_Q8_1        0.000   0.000     1 795.00   0.0001  0.992024    
Q10            6.289   6.289     1 795.00   8.1126  0.004509 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(mod.reduce.novelty, mod.full.novelty)
Data: df
Models:
mod.reduce.novelty: log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
mod.full.novelty: log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
                   npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
mod.reduce.novelty    8 2103.8 2141.3 -1043.9   2087.8                       
mod.full.novelty     11 2102.5 2154.0 -1040.2   2080.5 7.3266  3    0.06219 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
Loading required package: leaps
Loading required package: SuppDists
step(lm(log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
Start:  AIC=-115.17
log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10

                Df Sum of Sq    RSS      AIC
- Q8_Q8_1        1     0.002 677.32 -117.172
<none>                       677.32 -115.173
- factor(Group)  3     5.900 683.22 -114.235
- Q10            1     6.515 683.84 -109.515
- Q7_Q7_2        1    15.138 692.46  -99.490
- Q7_Q7_1        1    19.947 697.27  -93.954
- log_count      1    66.787 744.11  -41.940

Step:  AIC=-117.17
log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q10

                Df Sum of Sq    RSS      AIC
<none>                       677.32 -117.172
- factor(Group)  3     5.908 683.23 -116.224
- Q10            1     7.106 684.43 -110.823
- Q7_Q7_2        1    15.596 692.92 -100.960
- Q7_Q7_1        1    20.066 697.39  -95.815
- log_count      1    67.067 744.39  -43.638

Call:
lm(formula = log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q10, data = df)

Coefficients:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
      -0.12958         0.23732         0.16843         0.09995         0.29271        -0.19912         0.17849  
           Q10  
       0.09942  
mod.reduce.ur <- lm(log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.ur)

Call:
lm(formula = log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9234 -0.9854  0.3754  0.7441  1.6632 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.509e-16  3.348e-02   0.000 1.000000    
log_count    2.476e-01  3.383e-02   7.319 6.11e-13 ***
Q7_Q7_1     -1.897e-01  4.207e-02  -4.509 7.51e-06 ***
Q7_Q7_2      1.157e-01  4.311e-02   2.685 0.007412 ** 
Q8_Q8_1     -4.640e-03  3.703e-02  -0.125 0.900327    
Q10          1.194e-01  3.613e-02   3.305 0.000993 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9471 on 794 degrees of freedom
Multiple R-squared:  0.1086,    Adjusted R-squared:  0.103 
F-statistic: 19.36 on 5 and 794 DF,  p-value: < 2.2e-16
AIC(mod.reduce.ur)
[1] 2191.288
BIC(mod.reduce.ur)
[1] 2224.081
mod.full.ur <- lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(mod.full.ur)

Call:
lm(formula = log_user_requirement ~ factor(Group) + log_count + 
    Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0497 -0.9697  0.3109  0.7168  1.7587 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.20771    0.06930  -2.997 0.002809 ** 
factor(Group)1  0.30843    0.09616   3.207 0.001393 ** 
factor(Group)2  0.15234    0.09604   1.586 0.113097    
factor(Group)3  0.34981    0.09554   3.661 0.000268 ***
log_count       0.23922    0.03363   7.114 2.53e-12 ***
Q7_Q7_1        -0.19761    0.04190  -4.716 2.84e-06 ***
Q7_Q7_2         0.11979    0.04301   2.785 0.005482 ** 
Q8_Q8_1        -0.01073    0.03680  -0.292 0.770735    
Q10             0.11094    0.03642   3.046 0.002398 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9391 on 791 degrees of freedom
Multiple R-squared:  0.1269,    Adjusted R-squared:  0.1181 
F-statistic: 14.37 on 8 and 791 DF,  p-value: < 2.2e-16
AIC(mod.full.ur)
[1] 2180.715
BIC(mod.full.ur)
[1] 2227.561
anova(mod.reduce.ur, mod.full.ur)
Analysis of Variance Table

Model 1: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10
Model 2: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q8_Q8_1 + Q10
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1    794 712.19                                  
2    791 697.59  3    14.602 5.5192 0.0009401 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
step(lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
Start:  AIC=-91.59
log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q8_Q8_1 + Q10

                Df Sum of Sq    RSS     AIC
- Q8_Q8_1        1     0.075 697.66 -93.500
<none>                       697.59 -91.586
- Q7_Q7_2        1     6.840 704.43 -85.780
- Q10            1     8.181 705.77 -84.258
- factor(Group)  3    14.602 712.19 -81.013
- Q7_Q7_1        1    19.617 717.20 -71.400
- log_count      1    44.633 742.22 -43.971

Step:  AIC=-93.5
log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q10

                Df Sum of Sq    RSS     AIC
<none>                       697.66 -93.500
- Q7_Q7_2        1     6.788 704.45 -87.755
- Q10            1     8.389 706.05 -85.938
- factor(Group)  3    14.541 712.20 -82.997
- Q7_Q7_1        1    19.975 717.64 -72.917
- log_count      1    44.575 742.24 -45.954

Call:
lm(formula = log_user_requirement ~ factor(Group) + log_count + 
    Q7_Q7_1 + Q7_Q7_2 + Q10, data = df)

Coefficients:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
       -0.2068          0.3078          0.1509          0.3482          0.2386         -0.1987          0.1178  
           Q10  
        0.1080  
mod.reduce.vis <- lmer( log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.reduce.vis)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +      (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2124.2   2161.7  -1054.1   2108.2      792 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.2613 -0.1514  0.3040  0.6139  1.7276 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.04898  0.2213  
 Residual             0.80459  0.8970  
Number of obs: 800, groups:  phase, 5

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)    
(Intercept) -3.387e-15  1.039e-01  4.990e+00   0.000  1.00000    
log_count    3.267e-01  3.223e-02  7.968e+02  10.137  < 2e-16 ***
Q7_Q7_1     -1.748e-01  3.984e-02  7.950e+02  -4.386 1.31e-05 ***
Q7_Q7_2      2.028e-01  4.083e-02  7.950e+02   4.965 8.39e-07 ***
Q8_Q8_1     -7.557e-02  3.507e-02  7.950e+02  -2.155  0.03149 *  
Q10          1.007e-01  3.422e-02  7.950e+02   2.944  0.00334 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
log_count  0.000                              
Q7_Q7_1    0.000  0.092                       
Q7_Q7_2    0.000 -0.050 -0.556                
Q8_Q8_1    0.000 -0.062 -0.085  -0.167        
Q10        0.000 -0.066 -0.021  -0.120  -0.273
AIC(mod.reduce.vis)
[1] 2124.233
BIC(mod.reduce.vis)
[1] 2161.709
mod.full.vis <- lmer( log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.full.vis)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +      Q8_Q8_1 + Q10 + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2107.2   2158.8  -1042.6   2085.2      789 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4115 -0.1553  0.3094  0.6174  1.6554 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.04873  0.2207  
 Residual             0.78168  0.8841  
Number of obs: 800, groups:  phase, 5

Fixed effects:
                Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)     -0.27038    0.11833   8.50468  -2.285 0.049801 *  
factor(Group)1   0.36811    0.09053 794.99787   4.066 5.26e-05 ***
factor(Group)2   0.30044    0.09042 794.99050   3.323 0.000932 ***
factor(Group)3   0.38658    0.08995 794.99347   4.298 1.94e-05 ***
log_count        0.31844    0.03185 796.72863   9.999  < 2e-16 ***
Q7_Q7_1         -0.17878    0.03945 795.00805  -4.532 6.74e-06 ***
Q7_Q7_2          0.19995    0.04049 794.99566   4.938 9.64e-07 ***
Q8_Q8_1         -0.08388    0.03464 794.99724  -2.421 0.015682 *  
Q10              0.09844    0.03429 794.99626   2.871 0.004207 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) fc(G)1 fc(G)2 fc(G)3 lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
factr(Grp)1 -0.399                                                   
factr(Grp)2 -0.398  0.512                                            
factr(Grp)3 -0.401  0.526  0.525                                     
log_count    0.022 -0.063 -0.011 -0.041                              
Q7_Q7_1      0.002 -0.046  0.046 -0.012  0.096                       
Q7_Q7_2      0.017  0.008 -0.088 -0.008 -0.053 -0.560                
Q8_Q8_1      0.025 -0.022 -0.050 -0.056 -0.060 -0.086  -0.162        
Q10         -0.006 -0.082  0.095  0.018 -0.056 -0.004  -0.135  -0.274
AIC(mod.full.vis)
[1] 2107.234
BIC(mod.full.vis)
[1] 2158.764
anova(mod.reduce.vis, mod.full.vis)
Data: df
Models:
mod.reduce.vis: log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
mod.full.vis: log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
mod.reduce.vis    8 2124.2 2161.7 -1054.1   2108.2                         
mod.full.vis     11 2107.2 2158.8 -1042.6   2085.2 22.999  3   4.04e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
step(lm(log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
Start:  AIC=-131.12
log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10

            Df Sum of Sq    RSS      AIC
<none>                   663.96 -131.117
- Q8_Q8_1    1     4.428 668.38 -127.799
- Q10        1     6.615 670.57 -125.186
- Group      3    18.523 682.48 -115.104
- Q7_Q7_1    1    16.528 680.48 -113.446
- Q7_Q7_2    1    19.340 683.30 -110.147
- log_count  1    68.596 732.55  -54.463

Call:
lm(formula = log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Coefficients:
(Intercept)       Group1       Group2       Group3    log_count      Q7_Q7_1      Q7_Q7_2      Q8_Q8_1  
   -0.27221      0.37203      0.30113      0.38910      0.29656     -0.18139      0.20142     -0.08245  
        Q10  
    0.09975  
mod.reduce.total <- lm( log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.total)

Call:
lm(formula = log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1861 -0.1993  0.2443  0.5703  1.4738 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.422e-16  3.222e-02   0.000 1.000000    
log_count    3.419e-01  3.255e-02  10.503  < 2e-16 ***
Q7_Q7_1     -1.852e-01  4.048e-02  -4.576 5.51e-06 ***
Q7_Q7_2      1.875e-01  4.148e-02   4.520 7.12e-06 ***
Q8_Q8_1     -8.656e-02  3.563e-02  -2.429 0.015349 *  
Q10          1.241e-01  3.477e-02   3.570 0.000378 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9113 on 794 degrees of freedom
Multiple R-squared:  0.1748,    Adjusted R-squared:  0.1696 
F-statistic: 33.63 on 5 and 794 DF,  p-value: < 2.2e-16
AIC(mod.reduce.total)
[1] 2129.645
BIC(mod.reduce.total)
[1] 2162.437
mod.full.total <- lm( log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.full.total)

Call:
lm(formula = log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9973 -0.1279  0.2773  0.5482  1.4041 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.29834    0.07061  -4.225 2.66e-05 ***
Group1       0.43251    0.09786   4.420 1.13e-05 ***
Group2       0.28093    0.09793   2.869 0.004231 ** 
Group3       0.45080    0.09734   4.631 4.25e-06 ***
Q7_Q7_1     -0.23066    0.04253  -5.424 7.76e-08 ***
Q7_Q7_2      0.20946    0.04380   4.783 2.06e-06 ***
Q8_Q8_1     -0.07307    0.03745  -1.951 0.051416 .  
Q10          0.13970    0.03708   3.767 0.000177 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9576 on 792 degrees of freedom
Multiple R-squared:  0.09105,   Adjusted R-squared:  0.08302 
F-statistic: 11.33 on 7 and 792 DF,  p-value: 9.643e-14
AIC(mod.full.total)
[1] 2210.925
BIC(mod.full.total)
[1] 2253.087
anova(mod.reduce.total, mod.full.total)
Analysis of Variance Table

Model 1: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
Model 2: log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
  Res.Df    RSS Df Sum of Sq F Pr(>F)
1    794 659.37                      
2    792 726.25  2   -66.874         
# convert to nominal factor
df$Group = factor(df$Group)
df$phase = factor(df$phase)
library(plyr)
ddply(df, ~ Group * phase, function(data) summary(data$log_novelty) )
ddply(df, ~ Group * phase, summarise, log_novelty.mean=mean(log_novelty), log_novelty.sd = sd(log_novelty))
# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_novelty")

with(df, interaction.plot(Group, phase, log_novelty, ylim=c(0, max(log_novelty)))) # interaction plot

# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")

with(df, interaction.plot(Group, phase, log_user_requirement, ylim=c(0, max(log_user_requirement)))) # interaction plot

# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_infovis")

with(df, interaction.plot(Group, phase, log_infovis, ylim=c(0, max(log_infovis)))) # interaction plot

# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")

with(df, interaction.plot(Group, phase, log_total, ylim=c(0, max(log_total)))) # interaction plot

m = lmer(log_novelty ~ Group + (1|Student), data=df, REML=FALSE)
summary(m)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_novelty ~ Group + (1 | Student)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2046.0   2074.1  -1017.0   2034.0      794 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.2669 -0.6407  0.1015  0.6356  2.3143 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.4473   0.6688  
 Residual             0.5366   0.7325  
Number of obs: 800, groups:  Student, 159

Fixed effects:
            Estimate Std. Error       df t value Pr(>|t|)  
(Intercept)  -0.1667     0.1224 159.4644  -1.361    0.175  
Group1        0.2933     0.1698 159.0131   1.728    0.086 .
Group2        0.1952     0.1689 159.4644   1.156    0.249  
Group3        0.1401     0.1689 159.4644   0.829    0.408  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
       (Intr) Group1 Group2
Group1 -0.721              
Group2 -0.725  0.523       
Group3 -0.725  0.523  0.526
plot(resid(m, type = "pearson") ~ fitted(m))

qqnorm(resid(m, type = "pearson"))
qqline(resid(m, type = "pearson"))

# library for LMM we will use on relational log_novelty 

library(lme4)
library(lmerTest)
library(car)

set sum-to-zero contrast for ANOVA cells

contrasts(df$Group) <= "contr.sum"
     1    2    3
0 TRUE TRUE TRUE
1 TRUE TRUE TRUE
2 TRUE TRUE TRUE
3 TRUE TRUE TRUE
contrasts(df$phase) <= "contr.sum"
     2    3    4    5
1 TRUE TRUE TRUE TRUE
2 TRUE TRUE TRUE TRUE
3 TRUE TRUE TRUE TRUE
4 TRUE TRUE TRUE TRUE
5 TRUE TRUE TRUE TRUE
# phase is nested within group 
fit <- lm(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(fit)

Call:
lm(formula = log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1861 -0.1993  0.2443  0.5703  1.4738 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.422e-16  3.222e-02   0.000 1.000000    
log_count    3.419e-01  3.255e-02  10.503  < 2e-16 ***
Q7_Q7_1     -1.852e-01  4.048e-02  -4.576 5.51e-06 ***
Q7_Q7_2      1.875e-01  4.148e-02   4.520 7.12e-06 ***
Q8_Q8_1     -8.656e-02  3.563e-02  -2.429 0.015349 *  
Q10          1.241e-01  3.477e-02   3.570 0.000378 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9113 on 794 degrees of freedom
Multiple R-squared:  0.1748,    Adjusted R-squared:  0.1696 
F-statistic: 33.63 on 5 and 794 DF,  p-value: < 2.2e-16
library(multcomp)
Loading required package: mvtnorm
Loading required package: survival
Loading required package: TH.data
Loading required package: MASS

Attaching package: ‘TH.data’

The following object is masked from ‘package:MASS’:

    geyser
library(lsmeans)
Loading required package: emmeans
The 'lsmeans' package is now basically a front end for 'emmeans'.
Users are encouraged to switch the rest of the way.
See help('transition') for more information, including how to
convert old 'lsmeans' objects and scripts to work with 'emmeans'.
#summary(glht(fit, lsm(pairwise ~ roup / phase)), test = adjusted(type='holm'))
fit.full <- lm(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(fit.full)

Call:
lm(formula = log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1191 -0.2201  0.2313  0.5645  1.3650 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.27052    0.06637  -4.076 5.04e-05 ***
Group1       0.37292    0.09209   4.050 5.64e-05 ***
Group2       0.27040    0.09198   2.940 0.003380 ** 
Group3       0.41236    0.09150   4.507 7.58e-06 ***
log_count    0.33294    0.03220  10.339  < 2e-16 ***
Q7_Q7_1     -0.19094    0.04013  -4.758 2.32e-06 ***
Q7_Q7_2      0.18702    0.04119   4.540 6.49e-06 ***
Q8_Q8_1     -0.09491    0.03524  -2.693 0.007223 ** 
Q10          0.11965    0.03488   3.430 0.000635 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8994 on 791 degrees of freedom
Multiple R-squared:  0.1993,    Adjusted R-squared:  0.1912 
F-statistic: 24.61 on 8 and 791 DF,  p-value: < 2.2e-16
anova(fit, fit.full)
Analysis of Variance Table

Model 1: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
Model 2: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1    794 659.37                                  
2    791 639.79  3    19.585 8.0715 2.664e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# histograms for two factors
boxplot(log_total ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")

with(df, interaction.plot(Group, phase, log_total, ylim=c(0, max(log_total)))) # interaction plot

fit.lmer <- lmer(log_total ~ (1 | Group) + (1| phase:Group), data = df, REML= FALSE)
summary(fit.lmer)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ (1 | Group) + (1 | phase:Group)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2254.0   2272.7  -1123.0   2246.0      796 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.94437 -0.01942  0.28792  0.61016  1.43336 

Random effects:
 Groups      Name        Variance Std.Dev.
 phase:Group (Intercept) 0.03675  0.1917  
 Group       (Intercept) 0.02060  0.1435  
 Residual                0.94281  0.9710  
Number of obs: 800, groups:  phase:Group, 20; Group, 4

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)
(Intercept) -0.006239   0.090377  3.950735  -0.069    0.948
library(lmerTest)
fit.lmer <- lmer(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML= FALSE)
summary(fit.lmer)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 +      Q10 + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2071.8   2123.3  -1024.9   2049.8      789 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.7050 -0.2260  0.2540  0.5958  1.8126 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.05292  0.2300  
 Residual             0.74729  0.8645  
Number of obs: 800, groups:  phase, 5

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)  -0.26867    0.12105   8.07626  -2.219 0.056926 .  
Group1        0.36895    0.08852 794.99855   4.168 3.41e-05 ***
Group2        0.26970    0.08841 794.99199   3.051 0.002360 ** 
Group3        0.40980    0.08795 794.99463   4.659 3.72e-06 ***
log_count     0.35508    0.03114 796.55633  11.402  < 2e-16 ***
Q7_Q7_1      -0.18830    0.03857 795.00761  -4.882 1.27e-06 ***
Q7_Q7_2       0.18553    0.03959 794.99658   4.686 3.28e-06 ***
Q8_Q8_1      -0.09636    0.03387 794.99799  -2.845 0.004556 ** 
Q10           0.11832    0.03353 794.99712   3.529 0.000441 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) Group1 Group2 Group3 lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
Group1    -0.381                                                   
Group2    -0.381  0.512                                            
Group3    -0.383  0.526  0.525                                     
log_count  0.021 -0.063 -0.011 -0.041                              
Q7_Q7_1    0.002 -0.046  0.046 -0.012  0.096                       
Q7_Q7_2    0.017  0.008 -0.088 -0.008 -0.053 -0.560                
Q8_Q8_1    0.024 -0.022 -0.050 -0.056 -0.060 -0.086  -0.162        
Q10       -0.006 -0.082  0.095  0.018 -0.056 -0.004  -0.135  -0.274
anova(fit.lmer)
Type III Analysis of Variance Table with Satterthwaite's method
          Sum Sq Mean Sq NumDF  DenDF  F value    Pr(>F)    
Group     19.278   6.426     3 794.99   8.5989 1.272e-05 ***
log_count 97.157  97.157     1 796.56 130.0129 < 2.2e-16 ***
Q7_Q7_1   17.809  17.809     1 795.01  23.8318 1.271e-06 ***
Q7_Q7_2   16.407  16.407     1 795.00  21.9558 3.281e-06 ***
Q8_Q8_1    6.048   6.048     1 795.00   8.0936 0.0045564 ** 
Q10        9.306   9.306     1 795.00  12.4524 0.0004414 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(multcomp)
summary(glht(fit.lmer, linfct = mcp(Group = "Tukey")), test = adjusted("holm"))

     Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: lmer(formula = log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10 + (1 | phase), data = df, REML = FALSE)

Linear Hypotheses:
           Estimate Std. Error z value Pr(>|z|)    
1 - 0 == 0  0.36895    0.08852   4.168 0.000154 ***
2 - 0 == 0  0.26970    0.08841   3.051 0.009136 ** 
3 - 0 == 0  0.40980    0.08795   4.659  1.9e-05 ***
2 - 1 == 0 -0.09925    0.08739  -1.136 0.512120    
3 - 1 == 0  0.04085    0.08594   0.475 0.634555    
3 - 2 == 0  0.14010    0.08597   1.630 0.309475    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- holm method)
fit.lmer.reduced <- lmer(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML=FALSE)
summary(fit.lmer.reduced)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 |      phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2091.2   2128.7  -1037.6   2075.2      792 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4850 -0.2298  0.2612  0.5992  1.8832 

Random effects:
 Groups   Name        Variance Std.Dev.
 phase    (Intercept) 0.05319  0.2306  
 Residual             0.77150  0.8784  
Number of obs: 800, groups:  phase, 5

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)    
(Intercept) -8.153e-16  1.077e-01  4.992e+00   0.000 1.000000    
log_count    3.640e-01  3.156e-02  7.966e+02  11.532  < 2e-16 ***
Q7_Q7_1     -1.827e-01  3.902e-02  7.950e+02  -4.683 3.33e-06 ***
Q7_Q7_2      1.861e-01  3.998e-02  7.950e+02   4.655 3.80e-06 ***
Q8_Q8_1     -8.804e-02  3.435e-02  7.950e+02  -2.563 0.010547 *  
Q10          1.226e-01  3.351e-02  7.950e+02   3.658 0.000271 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
log_count  0.000                              
Q7_Q7_1    0.000  0.092                       
Q7_Q7_2    0.000 -0.050 -0.556                
Q8_Q8_1    0.000 -0.062 -0.085  -0.167        
Q10        0.000 -0.066 -0.021  -0.120  -0.273
anova(fit.lmer.reduced, fit.lmer)
Data: df
Models:
fit.lmer.reduced: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
fit.lmer: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
                 npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
fit.lmer.reduced    8 2091.2 2128.7 -1037.6   2075.2                         
fit.lmer           11 2071.8 2123.3 -1024.9   2049.8 25.388  3  1.281e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

User Requirement Score

# phase is nested within group 
fit.requirement.full <- lmer(log_user_requirement ~  factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML = FALSE)
Anova(fit.requirement.full, type=3, test.statistics="F")
Analysis of Deviance Table (Type III Wald chisquare tests)

Response: log_user_requirement
                Chisq Df Pr(>Chisq)    
(Intercept)    2.5959  1  0.1071398    
factor(Group) 17.5649  3  0.0005407 ***
log_count     65.8539  1  4.856e-16 ***
Q7_Q7_1       23.4133  1  1.307e-06 ***
Q7_Q7_2        8.1810  1  0.0042331 ** 
Q8_Q8_1        0.1217  1  0.7271747    
Q10            9.7952  1  0.0017497 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fit.requirement.full
Linear mixed model fit by maximum likelihood  ['lmerModLmerTest']
Formula: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 +  
    Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
   Data: df
      AIC       BIC    logLik  deviance  df.resid 
 2139.609  2191.140 -1058.805  2117.609       789 
Random effects:
 Groups   Name        Std.Dev.
 phase    (Intercept) 0.2436  
 Residual             0.9018  
Number of obs: 800, groups:  phase, 5
Fixed Effects:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
      -0.20567         0.30406         0.15157         0.34699         0.26363        -0.19470         0.11814  
       Q8_Q8_1             Q10  
      -0.01233         0.10947  
# histograms for two factors
boxplot(log_user_requirement ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")

with(df, interaction.plot(Group, phase, log_user_requirement, ylim=c(0, max(log_user_requirement)))) # interaction plot

# phase is nested within group 
fit.requirement <- lmer(log_user_requirement ~  log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +  ( 1 | phase), data = df, REML = FALSE)
Anova(fit, type=3, test.statistics="F")
Anova Table (Type III tests)

Response: log_total
            Sum Sq  Df  F value    Pr(>F)    
(Intercept)   0.00   1   0.0000  1.000000    
log_count    91.62   1 110.3216 < 2.2e-16 ***
Q7_Q7_1      17.39   1  20.9366 5.507e-06 ***
Q7_Q7_2      16.97   1  20.4327 7.117e-06 ***
Q8_Q8_1       4.90   1   5.9015  0.015349 *  
Q10          10.59   1  12.7473  0.000378 ***
Residuals   659.37 794                       
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fit.requirement
Linear mixed model fit by maximum likelihood  ['lmerModLmerTest']
Formula: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 +      Q10 + (1 | phase)
   Data: df
      AIC       BIC    logLik  deviance  df.resid 
 2150.984  2188.461 -1067.492  2134.984       792 
Random effects:
 Groups   Name        Std.Dev.
 phase    (Intercept) 0.2442  
 Residual             0.9117  
Number of obs: 800, groups:  phase, 5
Fixed Effects:
(Intercept)    log_count      Q7_Q7_1      Q7_Q7_2      Q8_Q8_1          Q10  
 -1.007e-14    2.720e-01   -1.869e-01    1.142e-01   -6.273e-03    1.177e-01  
plot(resid(m, type = "pearson") ~ fitted(m))

qqnorm(resid(m, type = "pearson"))
qqline(resid(m, type = "pearson"))

anova(fit.requirement, fit.requirement.full)
Data: df
Models:
fit.requirement: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
fit.requirement.full: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase)
                     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
fit.requirement         8 2151.0 2188.5 -1067.5   2135.0                         
fit.requirement.full   11 2139.6 2191.1 -1058.8   2117.6 17.374  3  0.0005919 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
---
title: "R Notebook"
output:
  pdf_document: default
  html_notebook: default
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
# library for LMM 

library(lme4)
library(lmerTest)
library(car)
```



```{r}
df<-read.csv("input/scores_commits.csv", header =TRUE, sep=",")
df <- df[complete.cases(df), ]                  # Apply complete.cases function
df
```

```{r}
# convert to nominal factor
df$Group = factor(df$Group)
df$phase = factor(df$phase)
```


```{r}
df$log_novelty <- log(df$novelty+1) 
df$log_user_requirement <- log(df$user.requirement+1)
df$log_infovis <- log(df$infovis+1)
df$log_total <- log(df$total+1)
df$log_count <- log(df$count+1)
df$Q7_Q7_1 <- log(df$Q7_Q7_1+1)
df$Q7_Q7_2 <- log(df$Q7_Q7_2+1)
df$Q8_Q8_1 <- log(df$Q8_Q8_1+1)
df$Q10 <- log(df$Q10+1)
```


```{r}
# standardizing variables for skills and aspirations. 
cols <- c("Q7_Q7_1", "Q7_Q7_2", "Q8_Q8_1", "Q10", "log_novelty", "log_user_requirement", "log_infovis", "log_total", "log_count")
df[cols] <- scale(df[cols])
df
```
```{r}
mod.reduce.novelty <- lmer( log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase ), data = df, REML = FALSE)
summary(mod.reduce.novelty)
AIC(mod.reduce.novelty)
BIC(mod.reduce.novelty)

```


```{r}
mod.full.novelty <- lmer( log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | phase), data = df, REML = FALSE)
summary(mod.full.novelty)
AIC(mod.full.novelty)
BIC(mod.full.novelty)
anova(mod.full.novelty)

```

```{r}
anova(mod.reduce.novelty, mod.full.novelty)
```
```{r}
library(ALSM)
step(lm(log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
```

```{r}
mod.reduce.ur <- lm(log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.ur)
AIC(mod.reduce.ur)
BIC(mod.reduce.ur)
```

```{r}
mod.full.ur <- lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(mod.full.ur)
AIC(mod.full.ur)
BIC(mod.full.ur)
```
```{r}
anova(mod.reduce.ur, mod.full.ur)
```
```{r}
library(ALSM)
step(lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
```



```{r}
mod.reduce.vis <- lmer( log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.reduce.vis)
AIC(mod.reduce.vis)
BIC(mod.reduce.vis)
```

```{r}
mod.full.vis <- lmer( log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.full.vis)
AIC(mod.full.vis)
BIC(mod.full.vis)
```
```{r}
anova(mod.reduce.vis, mod.full.vis)
```
```{r}
library(ALSM)
step(lm(log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data=df),
method="both", trace = 1 )
```
```{r}
mod.reduce.total <- lm( log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.total)
AIC(mod.reduce.total)
BIC(mod.reduce.total)
```


```{r}
mod.full.total <- lm( log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.full.total)
AIC(mod.full.total)
BIC(mod.full.total)
```
```{r}
anova(mod.reduce.total, mod.full.total)
```

```{r}
# convert to nominal factor
df$Group = factor(df$Group)
df$phase = factor(df$phase)
```

```{r}
library(plyr)
ddply(df, ~ Group * phase, function(data) summary(data$log_novelty) )
ddply(df, ~ Group * phase, summarise, log_novelty.mean=mean(log_novelty), log_novelty.sd = sd(log_novelty))
```
```{r}
# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_novelty")
with(df, interaction.plot(Group, phase, log_novelty, ylim=c(0, max(log_novelty)))) # interaction plot
```

```{r}
# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")
with(df, interaction.plot(Group, phase, log_user_requirement, ylim=c(0, max(log_user_requirement)))) # interaction plot
```

```{r}
# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_infovis")
with(df, interaction.plot(Group, phase, log_infovis, ylim=c(0, max(log_infovis)))) # interaction plot
```

```{r}
# histograms for two factors
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")
with(df, interaction.plot(Group, phase, log_total, ylim=c(0, max(log_total)))) # interaction plot
```






```{r}
m = lmer(log_novelty ~ Group + (1|Student), data=df, REML=FALSE)
summary(m)
```
```{r}
plot(resid(m, type = "pearson") ~ fitted(m))
qqnorm(resid(m, type = "pearson"))
qqline(resid(m, type = "pearson"))
```
```{r}
# library for LMM we will use on relational log_novelty 

library(lme4)
library(lmerTest)
library(car)
```

# set sum-to-zero contrast for ANOVA cells 

```{r}
contrasts(df$Group) <= "contr.sum"
contrasts(df$phase) <= "contr.sum"
```

```{r}
# phase is nested within group 
fit <- lm(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(fit)
```

```{r}
library(multcomp)
library(lsmeans)
#summary(glht(fit, lsm(pairwise ~ roup / phase)), test = adjusted(type='holm'))
```

```{r}
fit.full <- lm(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(fit.full)
```

```{r}
anova(fit, fit.full)
```
```{r}
# histograms for two factors
boxplot(log_total ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")
with(df, interaction.plot(Group, phase, log_total, ylim=c(0, max(log_total)))) # interaction plot
```
```{r}
fit.lmer <- lmer(log_total ~ (1 | Group) + (1| phase:Group), data = df, REML= FALSE)
summary(fit.lmer)
```



```{r}
library(lmerTest)
fit.lmer <- lmer(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML= FALSE)
summary(fit.lmer)
anova(fit.lmer)
```

```{r}
library(multcomp)
summary(glht(fit.lmer, linfct = mcp(Group = "Tukey")), test = adjusted("holm"))
```


```{r}
fit.lmer.reduced <- lmer(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML=FALSE)
summary(fit.lmer.reduced)
```

```{r}
anova(fit.lmer.reduced, fit.lmer)
```


# User Requirement Score

```{r}
# phase is nested within group 
fit.requirement.full <- lmer(log_user_requirement ~  factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | phase), data = df, REML = FALSE)
Anova(fit.requirement.full, type=3, test.statistics="F")
fit.requirement.full
```

```{r}
# histograms for two factors
boxplot(log_user_requirement ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")
with(df, interaction.plot(Group, phase, log_user_requirement, ylim=c(0, max(log_user_requirement)))) # interaction plot
```


```{r}
# phase is nested within group 
fit.requirement <- lmer(log_user_requirement ~  log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +  ( 1 | phase), data = df, REML = FALSE)
Anova(fit, type=3, test.statistics="F")
fit.requirement
```

```{r}
plot(resid(m, type = "pearson") ~ fitted(m))
qqnorm(resid(m, type = "pearson"))
qqline(resid(m, type = "pearson"))
```

```{r}                                        
anova(fit.requirement, fit.requirement.full)
```
 
